{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training table from disk (the file name suggests it is already cleaned and merged)\n",
    "train_csv_path = '../../Large_output/train_clean_merge.csv'\n",
    "df_train = pd.read_csv(train_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2638.86 MB\n",
      "Memory usage after optimization is: 733.78 MB\n",
      "Decreased by 72.2%\n"
     ]
    }
   ],
   "source": [
    "# Downcast column dtypes to shrink the in-memory footprint (float16 allowed)\n",
    "df_train = reduce_mem_usage(df=df_train, use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Engineer the training features, then rescale the site 0 / meter 0 readings by 0.2931\n",
    "# (presumably a kBTU -> kWh unit conversion; confirm against the data description).\n",
    "# The rescale multiplies the stored readings, so re-running this cell applies it again.\n",
    "train_engineer = features_engineering(df_train)\n",
    "site0_meter0 = (train_engineer['site_id'] == 0) & (train_engineer['meter'] == 0)\n",
    "train_engineer.loc[site0_meter0, 'meter_reading'] = train_engineer.loc[site0_meter0, 'meter_reading'] * 0.2931"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model target: log1p of the meter reading\n",
    "target = np.log1p(df_train[\"meter_reading\"])\n",
    "\n",
    "# Predictor columns handed to the model\n",
    "feature_columns = [\n",
    "    'building_id', 'meter', 'site_id', 'primary_use', 'square_feet', 'air_temperature',\n",
    "    'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'hour', 'weekend', 'is_holiday',\n",
    "]\n",
    "features = df_train[feature_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import catboost as cgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define the function for optimization and setup the parameter range\n",
    "def cat_hyp(depth, bagging_temperature):\n",
    "    params = {'iterations': 1000, 'learning_rate': 0.05, 'eval_metric': 'RMSE', 'loss_function': 'RMSE', \n",
    "             'verbose': False, 'metric_period': 100, 'task_type': 'GPU'}\n",
    "    params['depth'] = int(round(depth))\n",
    "    params['bagging_temperature'] = bagging_temperature\n",
    "    cat_feat = [\"building_id\", \"site_id\", \"meter\", \n",
    "                        \"primary_use\",  \"weekend\",'is_holiday']\n",
    "    cv_dataset = cgb.Pool(data=features, label = target, cat_features = cat_feat)\n",
    "    scores = cgb.cv(cv_dataset, params, fold_count = 3)\n",
    "    return -1.0*np.min(scores['test-RMSE-mean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# setup the parameter opt range\n",
    "pds = {'depth': (1,12), 'bagging_temperature': (3,10)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bayes_opt import BayesianOptimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|   iter    |  target   | baggin... |   depth   |\n",
      "-------------------------------------------------\n",
      "| \u001b[0m 1       \u001b[0m | \u001b[0m-0.8063  \u001b[0m | \u001b[0m 5.622   \u001b[0m | \u001b[0m 11.46   \u001b[0m |\n",
      "| \u001b[0m 2       \u001b[0m | \u001b[0m-0.9055  \u001b[0m | \u001b[0m 8.124   \u001b[0m | \u001b[0m 7.585   \u001b[0m |\n",
      "| \u001b[0m 3       \u001b[0m | \u001b[0m-1.024   \u001b[0m | \u001b[0m 4.092   \u001b[0m | \u001b[0m 2.716   \u001b[0m |\n",
      "| \u001b[0m 4       \u001b[0m | \u001b[0m-0.8536  \u001b[0m | \u001b[0m 10.0    \u001b[0m | \u001b[0m 12.0    \u001b[0m |\n",
      "| \u001b[0m 5       \u001b[0m | \u001b[0m-0.8243  \u001b[0m | \u001b[0m 3.0     \u001b[0m | \u001b[0m 12.0    \u001b[0m |\n",
      "| \u001b[95m 6       \u001b[0m | \u001b[95m-0.8     \u001b[0m | \u001b[95m 6.155   \u001b[0m | \u001b[95m 12.0    \u001b[0m |\n",
      "| \u001b[95m 7       \u001b[0m | \u001b[95m-0.7945  \u001b[0m | \u001b[95m 5.479   \u001b[0m | \u001b[95m 12.0    \u001b[0m |\n",
      "| \u001b[0m 8       \u001b[0m | \u001b[0m-1.461   \u001b[0m | \u001b[0m 10.0    \u001b[0m | \u001b[0m 1.0     \u001b[0m |\n",
      "| \u001b[0m 9       \u001b[0m | \u001b[0m-0.8403  \u001b[0m | \u001b[0m 3.0     \u001b[0m | \u001b[0m 7.246   \u001b[0m |\n",
      "| \u001b[0m 10      \u001b[0m | \u001b[0m-0.7985  \u001b[0m | \u001b[0m 4.769   \u001b[0m | \u001b[0m 12.0    \u001b[0m |\n",
      "| \u001b[0m 11      \u001b[0m | \u001b[0m-0.7947  \u001b[0m | \u001b[0m 5.312   \u001b[0m | \u001b[0m 12.0    \u001b[0m |\n",
      "| \u001b[95m 12      \u001b[0m | \u001b[95m-0.7944  \u001b[0m | \u001b[95m 5.335   \u001b[0m | \u001b[95m 12.0    \u001b[0m |\n",
      "=================================================\n"
     ]
    }
   ],
   "source": [
    "# Run the Bayesian search: 3 random starting points, then 9 guided iterations\n",
    "optimizer = BayesianOptimization(f=cat_hyp, pbounds=pds, random_state=42)\n",
    "optimizer.maximize(init_points=3, n_iter=9)"
   ]
  },
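  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Best trial (iteration 12): bagging_temperature = 5.335, depth = 12.0, CV RMSE = 0.7944.\n",
    "# Note: depth = 12 is the upper bound of the searched range (1, 12)."
   ]
  }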
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
